В рамках данной лабораторной работы вам предлагается проанализировать набор данных о студентах двух школ в Португалии.
В файле students_data.csv представлена информация о студентах, посещающих два курса - математику (Math) и португальский язык (Por). Некоторые студенты представлены в обоих курсах, некоторые - только в одном. Для каждого студента известны три оценки по курсу: оценка за первое полугодие (G1), оценка за второе полугодие (G2) и итоговая оценка за год (G3).
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl
from sklearn.metrics import roc_curve,auc,make_scorer,confusion_matrix, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error,accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.linear_model import Lasso, Ridge, LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,StratifiedKFold,KFold, GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler,label_binarize
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree,export_graphviz,DecisionTreeRegressor
from IPython.display import Image
from IPython.display import display
from graphviz import Source
import os
os.environ["PATH"] += os.pathsep + 'C:\\Program Files\\Graphviz\\bin\\'
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
from xgboost import to_graphviz
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)
# Load the raw students dataset and take a first look at its size and head.
data = pd.read_csv("students_data.csv")
data.shape
data.head(15)
Данные представлены признаками различных типов: числовыми, категориальными, упорядоченными категориальными.
Описание признаков:
# Feature descriptions ship in a semicolon-separated, cp1251-encoded file.
pd.read_csv('students_data_features.csv',
            delimiter=';',
            encoding='windows-1251')
Распределение оценок (G1, G2, G3) одинаково в обеих частях. Tip: Используйте свои наработки из Лабораторной работы №1.
# Dump the unique values of every column to spot typos and inconsistent categories.
for column in data.columns.tolist(): print(column, " | ", data[column].unique())
Среди представленных данных лишним является ID, так как он нам попросту не нужен. Я решил его просто удалить
Ошибки и опечатки в данных:
В колонке cheating больше Nan-значений, чем всех остальных, так что я принял решение заменить все пустые значения на 'no' (не пойман - не вор)
# Drop the useless ID column.
data.drop(["ID"], axis = 1, inplace = True)
# Fix the typos and case inconsistencies found above
# ("M"->"m", "o"->"0", "t"->"T", "at-home"->"at_home", "futher"->"father").
data.sex.replace("M", "m", inplace=True)
data.Medu.replace("o", "0", inplace=True)
data.Fedu.replace("o", "0", inplace=True)
data.Pstatus.replace("t", "T", inplace=True)
data.replace("at-home", "at_home", inplace=True)
data.guardian.replace("futher", "father", inplace=True)
# Drop rows missing any of these columns; cheating NaNs become 'no'
# ("not caught == not a cheater").
data.dropna(subset = ['romantic',"famrel","Dalc","Walc"], inplace = True)
data.cheating.fillna('no', inplace=True)
print(data['Fjob'].value_counts(),'\n')
print(data['Mjob'].value_counts())
В колонках Mjob и Fjob ничего не дающее нам значение 'other' встречается чаще всего. Следовательно, данные из этих колонок не принесут нам пользы и их можно удалить.
# 'other' dominates Mjob/Fjob, so these columns carry little signal -- drop them.
data = data.drop(["Fjob", "Mjob"], axis=1)
print(data.dtypes)
Неверными типами данных являются:
# Cast columns that were read as object back to numeric types.
data['Medu'] = data['Medu'].apply(pd.to_numeric, errors='coerce',downcast='integer')
data['Fedu'] = data['Fedu'].apply(pd.to_numeric, errors='coerce',downcast='integer')
data['famrel'] = data.famrel.astype('int64')
data['Dalc'] = data.Dalc.astype('int64')
data['Walc'] = data.Walc.astype('int64')
# Absolute correlation of every numeric feature with the final grade G3.
abs(data.corr()['G3'])
Основной критерий оценки G3 почти не зависит от таких данных,как:
Следовательно, эти столбцы можно удалить
# Columns whose correlation with G3 is negligible are removed.
data = data.drop(["sex","Pstatus","activities", "famsup",
                  "nursery", "freetime","goout", "schoolsup",
                  "health", "famsize", "famrel", "paid"], axis=1)
# Binary-encode yes/no answers and the remaining two-level categoricals.
data.replace("no", 0, inplace=True)
data.replace("yes", 1, inplace=True)
# NOTE: the original also defined mappings for "sex", "famsize" and "Pstatus"
# here, but those columns are dropped just above, so the mappings were dead
# code and have been removed.
bin_addr = {"address": {"R": 0, "U": 1}}
data.replace(bin_addr, inplace=True)
bin_sch = {"school": {"GP": 0, "MS": 1}}
data.replace(bin_sch, inplace=True)
bin_sub = {"Subject": {"Por": 0, "Math": 1}}
data.replace(bin_sub, inplace=True)
# One-hot encode the object columns that remain and rebuild the frame.
numerical = data.select_dtypes(exclude=['object'])
categorical = data.select_dtypes(include=['object'])
onehot = pd.get_dummies(categorical)
df = pd.concat([numerical, onehot], axis=1)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Regression target: G3.  G2 is excluded; G1 stays as a feature ("with G1" variant).
X = df.drop(["G2", "G3"], axis=1)  # G1 is kept as a feature
y = df["G3"]
# 80/20 hold-out split.  NOTE(review): no random_state -- results vary per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(len(X_train))
print(len(X_test))
df.head(10)
G3). При решении задачи нельзя использовать признак G2. Для решения задачи примените следующие методы:
Для каждого метода выполните настройку гиперпараметров на кросс-валидации.
G1 и без него. Сравните качество решений в двух случаях.def compute_metrics(y_test, y_pred,y_train,y_train_pred):
print("----------------Test:--------------------")
print('MSE:', mean_squared_error(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2:', r2_score(y_test, y_pred))
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))
print("----------------Train:-------------------")
print('MSE:', mean_squared_error(y_train, y_train_pred))
print('MAE:', mean_absolute_error(y_train, y_train_pred))
print('R2:', r2_score(y_train, y_train_pred))
print('MAPE:', mean_absolute_percentage_error(y_train,y_train_pred))
# Shared CV splitter.  NOTE(review): StratifiedKFold stratifies on a
# regression target here (integer grades) -- unusual; confirm it is intended.
skfold = StratifiedKFold(n_splits=5, shuffle=True)
# Baseline ridge regression.
ridge = Ridge(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("RIDGE REGRESSION")
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))
print("Mean Cross-Validation, Kfold: {:.2f}".format(np.mean(cross_val_score(ridge, X_train, y_train, cv=skfold,scoring = 'explained_variance'))))
# Tune alpha over a log grid and report metrics of the best estimator.
param_grid = {'alpha': np.logspace(-3, 3, 7)}
grid = GridSearchCV(Ridge(), param_grid, cv=skfold,scoring = 'explained_variance', return_train_score=True)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
y_train_pred = grid.predict(X_train)
compute_metrics(y_test, y_pred,y_train, y_train_pred)
# Train/validation curve over alpha.
res = pd.DataFrame(grid.cv_results_)
res.plot("param_alpha", ["mean_train_score", "mean_test_score"], logx=True)
plt.title("Ridge grid search")
print(grid.best_params_, grid.best_score_)
# Plain linear regression baseline.
lr = LinearRegression().fit(X_train, y_train) # Fit the training data to a regression line
print("LINEAR REGRESSION")
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))
print("Mean Cross-Validation, Kfold: {:.2f}".format(np.mean(cross_val_score(lr, X_train, y_train, cv=skfold,scoring = 'explained_variance'))))
regressionModel = LinearRegression()
regressionModel.fit(X_train, y_train)
predictedDependentVariables = regressionModel.predict(X_train)
compute_metrics(y_test, regressionModel.predict(X_test),y_train,predictedDependentVariables)
# Compare the distribution of train/test targets with the train predictions.
sns.kdeplot(y_train, label='train')
sns.kdeplot(y_test, label='test')
sns.kdeplot(predictedDependentVariables, label='pred')
plt.legend()
plt.show()
# Degree-2 polynomial expansion of the features.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train)
poly_train = poly.transform(X_train)
poly_test = poly.transform(X_test)
print('Original number of features:', X_train.shape[1])
print('Number of features after polynomial transformation:', poly_train.shape[1])
# FIX: LinearRegression(normalize=True) was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- scale the expanded features explicitly via a pipeline.
regressionModel = make_pipeline(StandardScaler(), LinearRegression())
regressionModel.fit(poly_train, y_train)
compute_metrics(y_test, regressionModel.predict(poly_test),y_train,regressionModel.predict(poly_train))
sns.kdeplot(y_train, label='train')
sns.kdeplot(y_test, label='test')
sns.kdeplot(regressionModel.predict(poly_train), label='pred')
plt.legend()
# Degree-2 polynomial features fed into ridge regression.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train)
poly_train = poly.transform(X_train)
poly_test = poly.transform(X_test)
print('Original number of features:', X_train.shape[1])
print('Number of features after polynomial transformation:', poly_train.shape[1])
# FIX: Ridge(normalize=True) was removed in scikit-learn 1.2 -- scale explicitly.
regressionModel = make_pipeline(StandardScaler(), Ridge())
regressionModel.fit(poly_train, y_train)
compute_metrics(y_test, regressionModel.predict(poly_test),y_train,regressionModel.predict(poly_train))
sns.kdeplot(y_train, label='train')
sns.kdeplot(y_test, label='test')
sns.kdeplot(regressionModel.predict(poly_train), label='pred')
plt.legend()
plt.show()
# Degree-3 polynomial features fed into ridge regression.
poly = PolynomialFeatures(degree=3, include_bias=False)
poly.fit(X_train)
poly_train = poly.transform(X_train)
poly_test = poly.transform(X_test)
print('Original number of features:', X_train.shape[1])
print('Number of features after polynomial transformation:', poly_train.shape[1])
# FIX: Ridge(normalize=True) was removed in scikit-learn 1.2 -- scale explicitly.
regressionModel = make_pipeline(StandardScaler(), Ridge())
regressionModel.fit(poly_train, y_train)
compute_metrics(y_test, regressionModel.predict(poly_test),y_train,regressionModel.predict(poly_train))
sns.kdeplot(y_train, label='train')
sns.kdeplot(y_test, label='test')
sns.kdeplot(regressionModel.predict(poly_train), label='pred')
plt.legend()
plt.show()
def get_model_quality(test_label, preds):
    """Print accuracy, balanced accuracy and a per-class classification report.

    FIX: the original called `sklearn.metrics.accuracy_score` and
    `metrics.classification_report`, but neither `sklearn` nor `metrics`
    is imported at module level -- both calls raised NameError.
    """
    from sklearn.metrics import balanced_accuracy_score, classification_report
    print("Accuracy:",
          round(accuracy_score(test_label, preds), 5),
          '\nBalanced accuracy:',
          round(balanced_accuracy_score(test_label, preds), 5))
    print()
    print(classification_report(test_label, preds))
# NOTE(review): a classifier on the 0-20 grade treats each grade as a class --
# confirm this baseline is intentional.
knn = KNeighborsClassifier().fit(X_train, y_train)
print("KNN CLASSIFER")
print("Training set score: {:.2f}".format(knn.score(X_train, y_train)))
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))
print("Mean Cross Validation, KFold: {:.2f}".format(np.mean(cross_val_score(knn, X_train, y_train, cv=skfold,scoring = 'explained_variance'))))
# k-NN regression baseline on the same split.
knn = KNeighborsRegressor().fit(X_train, y_train)
print("KNN Regressor")
print("Training set score: {:.2f}".format(knn.score(X_train, y_train)))
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))
# Kfold Cross Validation
print("Mean Cross Validation, KFold: {:.2f}".format(np.mean(cross_val_score(knn, X_train, y_train, cv=skfold,scoring = 'explained_variance'))))
# Tune k for a scaled k-NN regressor (StandardScaler + KNeighborsRegressor pipeline).
params = {
    'kneighborsregressor__n_neighbors': [1,2,3,4,5,6,7,8,9,10,15,20,50,100]
}
kf = KFold(n_splits=10, shuffle=True)
knn = KNeighborsRegressor()
scaler = StandardScaler()
knrModel = make_pipeline(
    scaler,
    knn
)
gd = GridSearchCV(knrModel, params, cv=kf, scoring = 'explained_variance')
gd.fit(X_train, y_train)
print(gd.best_params_,gd.best_score_)
# GridSearchCV refits the best pipeline on the whole training set already.
best = gd.best_estimator_
# best.fit(X_train_scaled, y_train)
y_pred = best.predict(X_test)
y_pred_train = best.predict(X_train)
compute_metrics(y_test, y_pred,y_train, y_pred_train)
sns.kdeplot(y_train, label='train')
sns.kdeplot(y_test, label='test')
sns.kdeplot(y_pred_train, label='pred')
plt.legend()
plt.show()
# Baseline random-forest classifier on the (integer-grade) target.
rf = RandomForestClassifier().fit(X_train, y_train)
print("Random Forest Classifier")  # FIX: was mislabelled "KNN Regressor"
print("Training set score: {:.2f}".format(rf.score(X_train, y_train)))
print("Test set score: {:.2f}".format(rf.score(X_test, y_test)))
# Kfold Cross Validation
print("Mean Cross Validation, KFold: {:.2f}".format(np.mean(cross_val_score(rf, X_train, y_train, cv=skfold))))
# Randomized hyperparameter search for the classifier.
# NOTE(review): scoring='explained_variance' is a regression metric -- confirm.
param = {'n_estimators': range(50, 101, 25), 'max_depth': [30], 'min_samples_split': range(2, 11, 2)}
grid = RandomizedSearchCV(RandomForestClassifier(criterion='entropy'), param, n_iter=20, cv=skfold,scoring = 'explained_variance', verbose=4)
grid.fit(X_train, y_train)
best = grid.best_estimator_
y_pred = best.predict(X_test)
# Random-forest regression baseline.
rf = RandomForestRegressor().fit(X_train, y_train)
print("Random Forest Regressor")
print("Training set score: {:.2f}".format(rf.score(X_train, y_train)))
print("Test set score: {:.2f}".format(rf.score(X_test, y_test)))
# Kfold Cross Validation
print("Mean Cross Validation, KFold: {:.2f}".format(np.mean(cross_val_score(rf, X_train, y_train, cv=skfold,scoring = 'explained_variance'))))
# Exhaustive grid over forest size/shape hyperparameters.
params_rf = {
    'n_estimators': [50, 100, 200,300, 400, 500],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [.1, .2],
    'max_features': ['log2', 'sqrt']
}
# Instantiate a random forests regressor 'rf'
rf = RandomForestRegressor(random_state = 17)
grid_rf = GridSearchCV(estimator=rf, param_grid = params_rf, cv = skfold, scoring='explained_variance', n_jobs = -1)
grid_rf.fit(X_train, y_train)
# Extract best hyperparameters from 'grid_df'...
best_hyperparams = grid_rf.best_params_
print('Best hyperparameters:\n', best_hyperparams)
# Extract best model from 'grid_rf'
print(grid_rf.best_score_)
best_model = grid_rf.best_estimator_
# Predict the test set labels...
y_pred = best_model.predict(X_test)
y_train_pred = best_model.predict(X_train)
compute_metrics(y_test, y_pred,y_train, y_train_pred)
# Grid search for a decision-tree classifier, then a bagging ensemble.
params_dt = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [.04, .06, .08],
    'max_features': [.2, .4, .6, .8]
}
SEED = 1
dt = DecisionTreeClassifier(random_state=SEED)
grid_dt= GridSearchCV(estimator=dt, param_grid = params_dt, cv = skfold, n_jobs=-1)
grid_dt.fit(X_train, y_train)
best_hyperparams = grid_dt.best_params_
print('Best hyperparameters:\n' , best_hyperparams)
best_score = grid_dt.best_score_
# FIX: the original printed 'Best '.format(best_score), which silently drops
# the score because the format string has no placeholder.
print('Best {}'.format(best_score))
best_model = grid_dt.best_estimator_
y_pred = best_model.predict(X_test)
# FIX: `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
# removed in 1.4.
bag = BaggingClassifier(n_estimators=30, estimator=DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=5))
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
df.head()
# "Without G1" variant: drop both intermediate grades and split by subject.
X2 = df.drop(["G2", "G1"], axis=1)  # without G1
X_m = X2[X2["Subject"] == 1].drop("G3", axis=1)   # Math features
X_p = X2[X2["Subject"] == 0].drop("G3", axis=1)   # Portuguese features
y_m = X2[X2["Subject"] == 1]["G3"]
y_p = X2[X2["Subject"] == 0]["G3"]
x_train_m, x_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2)
x_train_p, x_test_p, y_train_p, y_test_p = train_test_split(X_p, y_p, test_size=0.2)
X2.head()
# Ridge per subject: 10-fold CV scores, then alpha tuned on each train split.
print("Math",cross_val_score(Ridge(), X_m, y_m, cv=10))
print("Por",cross_val_score(Ridge(), X_p, y_p, cv=10),'\n')
param_grid = {'alpha': np.logspace(-3, 3, 7)}
grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
grid.fit(x_train_p, y_train_p)
y_pred = grid.predict(x_test_p)
y_train_pred = grid.predict(x_train_p)
print("Por:")
compute_metrics(y_test_p, y_pred,y_train_p, y_train_pred)
# Same search re-fitted on the Math subset.
grid.fit(x_train_m, y_train_m)
y_pred = grid.predict(x_test_m)
y_train_pred = grid.predict(x_train_m)
print("Math:")
compute_metrics(y_test_m, y_pred,y_train_m, y_train_pred)
# Lasso per subject with the same alpha grid.
param_grid = {'alpha': np.logspace(-3, 3, 7)}
grid = GridSearchCV(Lasso(), param_grid, cv=10, return_train_score=True)
grid.fit(x_train_p, y_train_p)
y_pred = grid.predict(x_test_p)
y_train_pred = grid.predict(x_train_p)
print("Por:")
compute_metrics(y_test_p, y_pred,y_train_p, y_train_pred)
grid.fit(x_train_m, y_train_m)
y_pred = grid.predict(x_test_m)
y_train_pred = grid.predict(x_train_m)
print("Math:")
compute_metrics(y_test_m, y_pred,y_train_m, y_train_pred)
# Plain linear regression per subject.
print("Math",cross_val_score(LinearRegression(), X_m, y_m, cv=10))
print("Por",cross_val_score(LinearRegression(), X_p, y_p, cv=10),'\n')
regressionModel = LinearRegression()
regressionModel.fit(x_train_p, y_train_p)
y_pred = regressionModel.predict(x_test_p)
y_train_pred = regressionModel.predict(x_train_p)
print("Por:")
compute_metrics(y_test_p, y_pred,y_train_p, y_train_pred)
regressionModel.fit(x_train_m, y_train_m)
y_pred = regressionModel.predict(x_test_m)
y_train_pred = regressionModel.predict(x_train_m)
print("Math:")
compute_metrics(y_test_m, y_pred,y_train_m, y_train_pred)
# k-NN regression per subject; grid over algorithm/metric/k.
print("Math",cross_val_score(KNeighborsRegressor(), X_m, y_m, cv=10))
print("Por",cross_val_score(KNeighborsRegressor(), X_p, y_p, cv=10), '\n')
parameters = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'metric': ['manhattan', 'minkowski', 'euclidean'], 'n_neighbors': range(3, 9)}
estimator = KNeighborsRegressor()
model = GridSearchCV(estimator, parameters, cv=10, n_jobs = -1)
model.fit(x_train_p, y_train_p)
print("Por")
print(model.best_score_)
model.fit(x_train_m, y_train_m)
print("Math")
print(model.best_score_)
# Decision-tree regression per subject; depth tuned on each train split.
print("Math",cross_val_score(DecisionTreeRegressor(random_state=17), X_m, y_m, cv=10))
print("Por",cross_val_score(DecisionTreeRegressor(random_state=17), X_p, y_p, cv=10))
parameters = {'max_depth': range(2, 6)}
tree = DecisionTreeRegressor(random_state=17)
model = GridSearchCV(tree, parameters, cv=10, n_jobs = -1)
model.fit(x_train_p, y_train_p)
print("Por")
print(model.best_score_)
model.fit(x_train_m, y_train_m)
print("Math")
print(model.best_score_)
В первом варианте (с G1) точность выше и результаты лучше.
Решите задачу бинарной классификации: постройте модель, предсказывающую, сдаст студент предмет (G3 >= 8) или не сдаст (G3 < 8).
При решении задачи нельзя использовать признаки G1 и G2.
# Binary target: 1 = passed (G3 >= 8), 0 = failed.
# A single vectorised comparison replaces the original per-row iterrows()
# loop -- identical result, much faster.
bin_df = df.copy()
bin_df['G3'] = (bin_df['G3'] >= 8).astype(int)
bin_df.head()
# Features exclude G1/G2 per the task; stratified 80/20 split on the binary label.
X = bin_df.drop(["G1", "G2", "G3"], axis = 1)
y =bin_df["G3"]
feature_names = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 17, stratify=y)
X.head()
# Class balance should match between the two splits thanks to stratify=y.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))
# Decision-tree baseline: 5-fold CV accuracy and hold-out ROC-AUC.
cvs = cross_val_score(DecisionTreeClassifier(random_state=17), X, y, cv=5)
print(cvs)
print("\n\n", cvs.mean())
model = DecisionTreeClassifier(max_depth = 5, random_state=17).fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
# ROC-AUC needs positive-class probabilities, hence column 1.
print(roc_auc_score(y_test, y_pred_proba[:, 1]))
model
# Grid search around depth-5 decision trees, then render the best tree.
params_dt = {
    'max_depth': [5],
    'min_samples_leaf': [.04, .06, .08],
    'max_features': [.2, .4, .6, .8]
}
SEED = 1
dt = DecisionTreeClassifier(random_state=SEED)
grid_dt= GridSearchCV(estimator=dt, param_grid = params_dt, cv = 5, n_jobs=-1)
# Fitting the grid_dt...
grid_dt.fit(X_train, y_train)
# Extract best hyperparameters from 'grid_dt'
best_hyperparams = grid_dt.best_params_
print('Best hyperparameters:\n' , best_hyperparams)
# Extract best CV score from 'gird_dt'
best_score = grid_dt.best_score_
print('Best ',(best_score))
# Extract best model from 'grid_dt'
best_model = grid_dt.best_estimator_
# Render the tuned tree via graphviz.
graph = Source(export_graphviz(best_model, out_file=None, feature_names=feature_names))
png = graph.pipe(format='png')
Image(png)
На кросс-валидации (5 фолдов, 2 повторения) оцените, как меняется качество модели Random Forest с ростом числа деревьев (при дефолтных значениях остальных параметров). Визуализируйте результаты. Сколько деревьев достаточно в данном случае и почему?
NB: В сравнение включите конфигурацию, аналогичную простому дереву решений.
# CV accuracy of Random Forest vs number of trees.
# FIX: in the original, the Train/Test scores were printed BEFORE `model` was
# (re)fitted for the current `i`, so every line reported the PREVIOUS model
# (and the very first line reported the earlier decision tree).
n_estimators= [1,2,3,5,10,20,50,100]
for i in n_estimators:
    cvs = cross_val_score(RandomForestClassifier(n_estimators=i), X, y, cv=5, n_jobs=2)
    print(cvs.mean())
    # NOTE(review): fitted on ALL data, so the train/test scores are optimistic.
    model = RandomForestClassifier(n_estimators=i).fit(X, y)
    print('Train:', model.score(X_train, y_train))
    print('Test:', model.score(X_test, y_test))
# Render the first tree of the last (largest) forest.
graph = Source(export_graphviz(model.estimators_[0], out_file=None, feature_names=feature_names))
png = graph.pipe(format='png')
display(Image(png))
cvs.mean() растет с увеличением числа деревьев, но для 50 и 100 деревьев результаты схожи.
Использовать 100 деревьев вместо 50 будет неоптимально.
%%time
# F1 scorer for the search below.
# FIX: the labels were re-encoded to 0/1 earlier, so pos_label must be 1;
# the original pos_label="yes" no longer exists in y and would error out.
f1_scorer = make_scorer(f1_score, pos_label=1)
params_rf = {'n_estimators': [5,10,20,50, 100, 200], 'max_depth': range(2, 7), 'min_samples_split': range(2, 11, 2)}
rf = RandomForestClassifier(random_state=17, oob_score = True)
grid_rf = GridSearchCV(estimator=rf, param_grid = params_rf, cv = 5, scoring=f1_scorer,
                       verbose=1, n_jobs = 2)
grid_rf.fit(X_train, y_train)
def draw_roc_auc_curve(y_test, y_pred_proba, title):
    """Plot a ROC curve (with its AUC) from positive-class probabilities."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred_proba[:, 1])
    area = auc(false_pos_rate, true_pos_rate)
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange',
             label='ROC кривая (area = %0.2f)' % area)
    # Diagonal = a no-skill classifier.
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{title} ROC-кривой')
    plt.legend(loc="lower right")
    plt.show()
# Evaluate the tuned random forest on the hold-out split.
best_hyperparams = grid_rf.best_params_
print('Best hyperparameters:\n', best_hyperparams)
best_model_rf = grid_rf.best_estimator_
y_pred = best_model_rf.predict(X_test)
print('F1: ',f1_score(y_test, y_pred))
# FIX: the original took probabilities from `best_model` (the decision tree
# found in an earlier cell), not from the random forest being evaluated here.
y_pred_proba = best_model_rf.predict_proba(X_test)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba[:,1])
draw_roc_auc_curve(y_test, y_pred_proba,"RandomForestClassifier")
# Per-tree F1 of every individual tree inside the best forest.
estimator = best_model_rf.estimators_
f1=[]
for tree_in_forest in estimator:
    f1.append(f1_score(y_test, tree_in_forest.predict(X_test)))
print(f1)
fig = plt.figure(figsize=(5, 5))
plt.plot(list(range(1, len(f1) + 1)), f1, "*")
plt.xlabel('Trees')
plt.ylabel('F1 score')
# Feature importances of the tuned forest, most important first.
Relative_Feature_importance = pd.Series(best_model_rf.feature_importances_, feature_names).sort_values(ascending=False)
Relative_Feature_importance.plot(kind='bar', title='Order of Feature Importance')
plt.ylabel('Feature Importance')
plt.show()
from sklearn.linear_model import LogisticRegression
# FIX: the grid includes penalty='l1', which the default lbfgs solver does not
# support -- every 'l1' candidate would fail.  liblinear handles both l1 and l2.
parameters = {'penalty' : ['l1', 'l2'], 'C' : np.logspace(-4, 4, 20)}
logit = GridSearchCV(LogisticRegression(random_state=0, solver='liblinear'),parameters,cv = 5, verbose=True, n_jobs=2)
logit.fit(X_train, y_train)
best_hyperparams = logit.best_params_
print('Best hyperparameters:\n', best_hyperparams)
best_model = logit.best_estimator_
# Predict the test set labels...
y_pred = best_model.predict(X_test)
# NOTE(review): AUC from hard labels underestimates the probability-based AUC
# computed below -- kept for comparison with the original output.
roc_auc_log = roc_auc_score(y_test, y_pred)
from sklearn.metrics import roc_curve, auc
# Compare ROC-AUC of the tuned random forest vs the tuned logistic regression.
y_pred_proba = best_model_rf.predict_proba(X_test)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba[:,1])
draw_roc_auc_curve(y_test, y_pred_proba,"Random Forest")
y_pred_proba = best_model.predict_proba(X_test)
roc_auc_log = roc_auc_score(y_test, y_pred_proba[:,1])
draw_roc_auc_curve(y_test, y_pred_proba,"Logistic Regression")
print(f"Roc_auc LogisticRegression = {roc_auc_log}\nRoc_auc Random Forest = {roc_auc_rf}")
# CV accuracy of XGBoost vs number of boosting rounds.
# NOTE(review): the per-iteration model is fitted on ALL data, so the
# train/test scores below are optimistic.
n_estimators= [1,2,3,5,10]
for i in n_estimators:
    cvs = cross_val_score(XGBClassifier(n_estimators=i), X, y, cv=5,n_jobs=2)
    print( cvs.mean())
    model = XGBClassifier(n_estimators=i).fit(X, y)
    print('Train:',model.score(X_train, y_train))
    print('Test:',model.score(X_test, y_test))
# Render the first tree of the last fitted booster.
xgb.to_graphviz(model, num_trees=0, rankdir='LR')
graph = Source(xgb.to_graphviz(model, num_trees=0, rankdir='LR'))
png = graph.pipe(format='png')
display(Image(png))
# Hyperparameter grid for XGBoost.
# FIX: the original prefixed every name with 'xgbclassifier__' (pipeline
# syntax) although a bare XGBClassifier is tuned -- GridSearchCV would reject
# all of them.  Also 'multi:softmax' is a multiclass objective; this task is
# binary, so 'binary:logistic' is the correct objective.
param_grid = [
    {'n_estimators': [1000],
     'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
     'max_depth': [3, 6, 9],
     'min_child_weight': [1, 3, 5],
     'objective': ['binary:logistic'],
     'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5] }
]
grid_model_xgb = GridSearchCV(
    estimator= XGBClassifier(),
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True,
    scoring=f1_scorer
)
grid_model_xgb.fit(X_train, y_train)
best_hyperparams = grid_model_xgb.best_params_
print('Best hyperparameters:\n', best_hyperparams)
best_model_rf = grid_model_xgb.best_estimator_
# Predict the test set labels...
y_pred = best_model_rf.predict(X_test)
f1_score(y_test, y_pred)
Решите задачу многоклассовой классификации: постройте модель, предсказывающую оценку студента по предмету по 4-балльной шкале.
Шкала: 3 — 18 ≤ G3 ≤ 20; 2 — 14 ≤ G3 ≤ 17; 1 — 8 ≤ G3 ≤ 13; 0 — G3 < 8. При решении задачи нельзя использовать признаки G1 и G2.
Для решения задачи примените следующие методы:
На кросс-валидации подберите оптимальные значения гиперпараметров алгоритмов.
from sklearn.metrics import confusion_matrix
import pylab as pl
# Map G3 onto a 4-point scale: 0: G3<8, 1: 8-13, 2: 14-17, 3: >=18.
# All masks are computed on a SNAPSHOT of the original values in one pass.
# The original iterrows() chain of independent ifs re-tested values that had
# already been relabelled and only produced correct results by accident
# (labels 0-3 happened not to match any later range).
multi_df = df.copy()
g3 = multi_df['G3'].copy()
multi_df.loc[g3 < 8, 'G3'] = 0
multi_df.loc[(g3 > 7) & (g3 < 14), 'G3'] = 1
multi_df.loc[(g3 > 13) & (g3 < 18), 'G3'] = 2
multi_df.loc[g3 > 17, 'G3'] = 3
multi_df.head(15)
# Stratified split on the 4-class label; G1/G2 excluded per the task.
X = multi_df.drop(["G1", "G2", "G3"], axis = 1)
y = multi_df["G3"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state = 17, stratify=y)
# Baseline multinomial logistic regression with mild regularisation.
log_reg = LogisticRegression(max_iter = 10000, C=0.1)
log_reg.fit(X_train, y_train)
y_pred=log_reg.predict(X_test)
ax= plt.subplot()
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, ax = ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()
from sklearn.model_selection import StratifiedKFold
# Hyperparameter grid for the multiclass logistic regression.
c_values = np.linspace(1e-3, 1, 100)
penal = ['l1', 'l2']
tol = np.linspace(1e-4, 1, 8)
parameters = {'C': c_values, 'penalty': penal, 'tol': tol}
# FIX: the default lbfgs solver does not support penalty='l1', so half the
# grid would fail -- saga handles both penalties.
estimator = LogisticRegression(random_state=17, class_weight = 'balanced', solver='saga', max_iter=5000)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)
model = GridSearchCV(estimator, parameters, cv=skf, scoring='accuracy', n_jobs = -1)
# NOTE(review): the search is fitted on ALL data (X, y), so the hold-out test
# split below leaks into hyperparameter selection.
model.fit(X, y)
model.best_estimator_
best = model.best_estimator_
best.fit(X_train, y_train)
y_pred=best.predict(X_test)
preds_prob = best.predict_proba(X_test)
ax= plt.subplot()
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, ax = ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# (removed an unused label_binarize() result the original computed here)
plt.show()
model.best_score_
def metric():
    """Grid-search KNeighborsClassifier once per distance metric (using the
    module-level skf, X, y) and plot the best CV accuracy for each metric.
    """
    metrics = ['euclidean', 'minkowski', 'manhattan','correlation']
    acc = []
    for i in metrics:
        # FIX: the original wrapped the metric name in a pointless f-string.
        parameters = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'metric': [i],
                      'n_neighbors': range(3, 11)}
        estimator = KNeighborsClassifier()
        model = GridSearchCV(estimator, parameters, cv=skf, scoring='accuracy', n_jobs = -1)
        model.fit(X, y)
        acc.append(model.best_score_)
    print(acc,metrics)
    fig = plt.figure(figsize=(13, 10))
    plt.plot(metrics, acc, "*")
    plt.xlabel('Metrics')
    plt.ylabel('Accuracy score')
    print(f'Maximal accuracy score: {max(acc)}')
# Full k-NN grid over algorithm x metric x k on the whole data.
# NOTE(review): 'correlation' is invalid for the tree-based algorithms, so
# those candidates will score NaN -- confirm this is acceptable.
parameters = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'metric': ['euclidean', 'minkowski', 'manhattan','correlation'],
              'n_neighbors': range(3, 11)}
estimator = KNeighborsClassifier()
model = GridSearchCV(estimator, parameters, cv=skf, scoring='accuracy', n_jobs = -1)
model.fit(X, y)
model.best_score_
best = model.best_estimator_
best.fit(X_train, y_train)
y_pred=best.predict(X_test)
preds_prob = best.predict_proba(X_test)
ax= plt.subplot()
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, ax = ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()
# Hold-out accuracy of a decision tree as a function of max_depth.
acc = []
for i in range(3, 10):
    estimator = DecisionTreeClassifier(random_state=17, max_depth = i).fit(X_train, y_train)
    acc.append(accuracy_score(estimator.predict(X_test), y_test))
fig = plt.figure(figsize=(13, 5))
plt.plot(range(3, 10), acc, "*")
plt.xlabel('Max depth')
plt.ylabel('Accuracy score')
print(f'Maximal accuracy score: {max(acc)}')
# Depth tuned on CV; best tree evaluated on the hold-out split.
md = range(3, 10)
parameters = {'max_depth': md}
estimator = DecisionTreeClassifier(random_state=17)
clf = GridSearchCV(estimator, parameters, scoring='accuracy', cv=skf, n_jobs = -1)
clf.fit(X, y)
clf.best_estimator_
best = clf.best_estimator_
best.fit(X_train, y_train)
y_pred=best.predict(X_test)
preds_prob = best.predict_proba(X_test)
print(clf.best_score_)
ax= plt.subplot()
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, ax = ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()
# Out-of-bag accuracy of a default random forest on the full data.
rf = RandomForestClassifier(random_state=17, oob_score = True).fit(X, y)
rf.oob_score_
fig, ax = plt.subplots(figsize=(10, 6))
# barh with string y-values already places and labels the ticks.
# FIX: the original follow-up set_yticklabels/set_yticks calls were redundant,
# and ax.set_yticks(X.columns) passes strings where numeric positions are
# required, which raises on current matplotlib.
ax.barh(X.columns, rf.feature_importances_)
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()
# Grid over forest size and depth, tuned on the training split.
n_estimators = [5,10,20,50, 100, 200, 300]
md = range(3, 8)
parameters = {'n_estimators': n_estimators, 'max_depth': md}
estimator = RandomForestClassifier(random_state=17, oob_score = True)
clf = GridSearchCV(estimator, parameters, scoring='accuracy', cv=skf, n_jobs = -1)
clf.fit(X_train, y_train)
clf.best_estimator_
best = clf.best_estimator_
best.fit(X_train, y_train)
y_pred=best.predict(X_test)
preds_prob = best.predict_proba(X_test)
print(clf.best_score_)
ax= plt.subplot()
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, ax = ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()
# OOB score as a function of forest size (depth fixed at 5).
score = []
for n in n_estimators:
    rf = RandomForestClassifier(random_state=17, n_estimators=n, oob_score = True, max_depth=5).fit(X, y)
    score.append(rf.oob_score_)
plt.plot(n_estimators, score, "*")
plt.xlabel('n_estimators')
plt.ylabel('Score')
print(f'Maximal score: {max(score)}')
from sklearn.ensemble import GradientBoostingClassifier
import warnings
# Silence sklearn UserWarnings emitted during the boosting fits below.
warnings.filterwarnings(action='ignore', category=UserWarning)
def GBC_model(clf):
    """Fit `clf` on the module-level train split, report hold-out accuracy and
    5-fold CV statistics, and plot the model's feature importances.
    """
    clf.fit(X_train, y_train)
    # Renamed from the misleading `predict_train`: these are TEST predictions.
    # (Also dropped an unused predict_proba() result the original computed.)
    predict_test = clf.predict(X_test)
    cv_score = cross_val_score(clf, X, y, cv = 5, scoring="accuracy")
    print(cv_score)
    print("----------------------Model performance-----------------------")
    print("Accuracy score: ", accuracy_score(y_test, predict_test))
    print("CV score: Mean - {}, Max - {}, Min - {}, Std - {}".format(np.mean(cv_score), np.max(cv_score),
                                                                     np.min(cv_score), np.std(cv_score)))
    Relative_Feature_importance = pd.Series(clf.feature_importances_, X.columns).sort_values(ascending=False)
    Relative_Feature_importance.plot(kind='bar', title='Order of Feature Importance')
    plt.ylabel('Feature Importance')
    plt.show()
# Gradient boosting with a raised learning rate, evaluated via GBC_model.
clf = GradientBoostingClassifier(learning_rate =0.25,random_state=17)
GBC_model(clf)